#!/usr/bin/env python

"""
Download a KiDS-DR5 lens sample suitable for T3:

- Table: KiDS_DR5_0_ugriZYJHKs_cat_fits (multi-band + stellar masses)
- Columns: ID, RAJ2000, DECJ2000, Z_B, mstar_med, mstar_bestfit, A_WORLD, B_WORLD
- Cuts (in ADQL):
    10.0 <= mstar_med <= 11.5
    0.01 <= Z_B <= 0.8

Output: data/KiDS_DR5_lenssample.csv
"""

import os
import sys

import pyvo
import pandas as pd

# Base URL of the ESO TAP service; main() passes it to pyvo.dal.TAPService.
TAP_URL = "https://archive.eso.org/tap_cat"
# Output CSV path. It is relative, so it resolves against the working
# directory at write time (main() changes directory before writing).
OUT_PATH = os.path.join("data", "KiDS_DR5_lenssample.csv")


def find_kids_dr5_table(tap):
    """Locate the KiDS-ESO-DR5 multi-band table in the TAP metadata.

    Only tables whose name or description mentions both "kids" and "dr5"
    are considered. Among those, each table is scored (+2 for a name match,
    +2 for a description match, +1 if the description mentions
    "multi-band"/"multiband", +1 if it mentions "ugri") and the
    highest-scoring table wins. On a tie the table listed first is kept.

    Parameters
    ----------
    tap
        Object with a ``tables`` attribute. ``tables`` may be a dict-like
        collection whose ``values()`` yields table objects, or a plain
        iterable of table objects. Each table object must expose ``name``
        and ``description`` attributes (either may be ``None``).

    Returns
    -------
    The ``name`` attribute of the best-matching table.

    Raises
    ------
    RuntimeError
        If no table mentions both "kids" and "dr5".
    """
    tables = tap.tables
    # A dict-like table listing may iterate over table *names* rather than
    # table objects; values() yields the objects in either case.
    if hasattr(tables, "values"):
        tables = tables.values()

    candidates = []
    for t in tables:
        name = (t.name or "").lower()
        desc = (t.description or "").lower()

        name_match = "kids" in name and "dr5" in name
        desc_match = "kids" in desc and "dr5" in desc
        # Generic keywords such as "multi-band" also describe unrelated
        # surveys, so they only rank tables already identified as KiDS DR5.
        if not (name_match or desc_match):
            continue

        score = 0
        if desc_match:
            score += 2
        if name_match:
            score += 2
        if "multi-band" in desc or "multiband" in desc:
            score += 1
        if "ugri" in desc:
            score += 1

        candidates.append((score, t.name, t.description))

    if not candidates:
        raise RuntimeError("Could not find any KiDS DR5 table in TAP metadata.")

    # max() returns the first maximal element, so ties resolve to the table
    # that appears first in the listing.
    best_score, best_name, best_desc = max(candidates, key=lambda c: c[0])

    print("[info] Best KiDS DR5 candidate table:", best_name)
    print("       description:", best_desc)
    return best_name


def main():
    """Download the KiDS-DR5 lens sample and write it to ``OUT_PATH``.

    Steps:

    1. Change into the parent of the directory containing this script
       (treated as the project root) and make sure ``data/`` exists.
    2. Connect to the TAP service at ``TAP_URL`` and locate the table via
       ``find_kids_dr5_table``.
    3. Run an asynchronous ADQL query with the mass and redshift cuts.
    4. Apply basic quality cuts in pandas and write the CSV.

    Exits with status 1 if no suitable table is found. A failed or aborted
    TAP job propagates as the exception raised by ``job.raise_if_error()``.
    """
    # Upper bound on returned rows requested from the service.
    max_rows = 1_000_000

    # Make sure we're in the project root
    here = os.path.abspath(os.path.dirname(__file__))
    project_root = os.path.dirname(here)
    os.chdir(project_root)

    os.makedirs("data", exist_ok=True)

    print("[info] Connecting to ESO TAP at", TAP_URL)
    tap = pyvo.dal.TAPService(TAP_URL)

    print("[info] Locating KiDS-ESO-DR5 multi-band table...")
    try:
        table_name = find_kids_dr5_table(tap)
    except RuntimeError as e:
        print("[error]", e)
        sys.exit(1)

    # ADQL query: select only the columns we actually need and
    # restrict to a sensible mass + redshift window.
    adql = f"""
        SELECT
            ID,
            RAJ2000,
            DECJ2000,
            Z_B,
            mstar_med,
            mstar_bestfit,
            A_WORLD,
            B_WORLD
        FROM {table_name}
        WHERE
            mstar_med BETWEEN 10.0 AND 11.5
            AND Z_B BETWEEN 0.01 AND 0.8
    """

    print("[info] Submitting ADQL query to table:", table_name)
    # Ask for a generous maxrec so we're not limited to 20k rows
    job = tap.submit_job(adql, maxrec=max_rows)
    try:
        job.run()
        job.wait()
        # NOTE(review): wait() is expected to return for failed/aborted jobs
        # too, so check the final phase explicitly before fetching anything.
        job.raise_if_error()

        print("[info] Query finished, fetching results...")
        result = job.fetch_result()
    finally:
        # Always remove the job from the server, even if the query failed.
        # A cleanup failure is reported but must not mask the real outcome.
        try:
            job.delete()
        except pyvo.dal.DALServiceError as e:
            print("[warn] Could not delete TAP job:", e)

    print("[info] Converting to pandas DataFrame...")
    df = result.to_table().to_pandas()

    print("[info] Rows returned:", len(df))
    if len(df) >= max_rows:
        # Hitting the requested limit exactly almost certainly means the
        # service cut the result short. The service may also enforce a lower
        # limit of its own, which this check cannot detect.
        print(
            "[warn] Row count reached maxrec =", max_rows,
            "- the result is probably truncated.",
        )

    # Basic cleanup: drop obvious missing/flagged masses and sizes
    # (we can refine this later if needed).
    # mstar_med is log10(M*/M_sun); sentinel values such as -99 / -999 and
    # any value <= 9.0 fail the comparison below, and NaN never compares
    # greater, so no separate sentinel replacement is needed.
    mask = df["mstar_med"].notna() & (df["mstar_med"] > 9.0)
    for col in ("Z_B", "A_WORLD", "B_WORLD"):
        mask &= df[col].notna() & (df[col] > 0.0)

    df = df.loc[mask].copy()
    print("[info] Rows after basic quality cuts:", len(df))

    print("[info] Writing CSV to", OUT_PATH)
    df.to_csv(OUT_PATH, index=False)
    print("[info] Done. Wrote", len(df), "rows to", OUT_PATH)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
